# Load the pre-processed BoT cluster data; the first CSV column holds row names.
Data <- read.csv("Final/BoT_Cluster_ENGR_Updated.csv", row.names = 1)
# Drop the first two (identifier) columns, keeping only the BoT item scores.
BoT <- Data[, -(1:2)]
# Hierarchical clustering: Euclidean distances, complete linkage.
dcan <- dist(BoT, method = "euclidean")
hcan <- hclust(d = dcan, method = "complete")
# Bare dendrogram (no axes, labels, or title) to eyeball the number of clusters.
plot(hcan, labels = FALSE, hang = 0.01, axes = FALSE, ann = FALSE, main = NA)
We can see from the dendrogram that the best choice for the total number of clusters is 3. We then compare this clustering against the original cohort id.
# Cut the dendrogram into 3 clusters and cross-tabulate against cohort id.
HC_cut <- cutree(hcan, k = 3)
table(Data$cohort_id, HC_cut)
## HC_cut
## 1 2 3
## 1 28 23 5
## 6 25 25 8
## 10 36 37 13
## 11 2 10 2
## 12 17 12 3
## 15 27 21 7
## 16 15 15 1
## 17 21 21 4
## 22 60 78 19
## 28 16 26 8
## 30 17 26 3
## 35 23 27 5
## 39 21 30 9
## 40 19 20 7
## 41 26 16 1
We will save the results so that statistical tests can be performed on them.
# Attach the HC cluster assignment to the feature table.
Data_HC_Result <- BoT %>% mutate(Cluster = HC_cut)

# Per-cluster mean of every feature. `across(everything(), mean)` replaces the
# superseded `summarise_all("mean")` (dplyr >= 1.0), and we reuse
# Data_HC_Result instead of rebuilding the mutate() from BoT a second time.
Data_HC_Result_Summarized <- Data_HC_Result %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean)) %>%
  ungroup()

paged_table(Data_HC_Result_Summarized)

# Persist both tables for the downstream tests.
write.csv(Data_HC_Result, "Final/Data_HC_Result.csv", row.names = FALSE)
write.csv(Data_HC_Result_Summarized, "Final/Data_HC_Result_Summarized.csv", row.names = FALSE)
Next, look at the histogram of each feature, broken down by cluster.
# One interactive dodged bar chart of counts per feature column, colored by
# HC cluster. The last column of Data_HC_Result is `Cluster`, so iterate over
# the remaining (feature) columns rather than a hard-coded 1:8.
plt <- htmltools::tagList()
n_features <- ncol(Data_HC_Result) - 1  # every column except `Cluster`
for (i in seq_len(n_features)) {
  feature <- colnames(Data_HC_Result)[i]
  p <- ggplot(Data_HC_Result,
              aes(x = as.factor(.data[[feature]]),
                  fill = as.factor(Cluster),
                  color = as.factor(Cluster))) +
    # geom_bar() is the supported form of geom_histogram(stat = "count")
    # and avoids the ggplot2 deprecation warning.
    geom_bar(position = "dodge")
  # ggplotly() already returns an htmlwidget; plotly's as.widget() wrapper
  # is deprecated and unnecessary.
  plt[[i]] <- ggplotly(p)
}
Using Kruskal-Wallis test as the non-parametric alternative to one-way ANOVA test, since I believe my data is not normally distributed. Also, the original data is of ordinal type.
# Kruskal-Wallis rank-sum test of each feature against the HC cluster label.
# The original call passed `data = Data_HC_Result` alongside a formula whose
# terms were already fully `$`-qualified, so the `data` argument was ignored;
# it is dropped here.
for (feature in colnames(Data_HC_Result)[1:8]) {
  print(kruskal.test(Data_HC_Result[[feature]] ~ Data_HC_Result$Cluster))
}
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 77.025, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 286.55, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 170.19, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 4.612, df = 2, p-value = 0.09966
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 25.141, df = 2, p-value = 3.473e-06
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 3.3634, df = 2, p-value = 0.1861
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 115.62, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 69.588, df = 2, p-value = 7.747e-16
Post-hoc test chosen: Pairwise Wilcoxon test
# Post-hoc pairwise Wilcoxon rank-sum tests with Benjamini-Hochberg adjustment,
# one run per feature. Single-feature example:
# pairwise.wilcox.test(Data_HC_Result[["Control"]], Data_HC_Result$Cluster, p.adjust.method = "BH")
for (feature in colnames(Data_HC_Result)[1:8]) {
  print(pairwise.wilcox.test(Data_HC_Result[[feature]],
                             Data_HC_Result$Cluster,
                             p.adjust.method = "BH"))
}
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.092 -
## 3 2.5e-12 < 2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 <2e-16 0.04
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 5e-11 -
## 3 <2e-16 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.12 -
## 3 0.26 0.99
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.00085 -
## 3 1.6e-05 0.01404
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 0.64 -
## 3 0.19 0.19
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.3 1e-14
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] and Data_HC_Result$Cluster
##
## 1 2
## 2 5.0e-14 -
## 3 1.3e-08 1
##
## P value adjustment method: BH
The “BH” (aka “fdr”) method of Benjamini and Hochberg and the “BY” method of Benjamini and Yekutieli control the false discovery rate — the expected proportion of false discoveries amongst the rejected hypotheses. The false discovery rate is a less stringent condition than the family-wise error rate, so these methods are more powerful than the others.
# Diagnose k for k-means via elbow (WSS), average silhouette, and gap statistic.
ggplotly(fviz_nbclust(BoT, kmeans, method = "wss"))
# Spell "silhouette" out in full: the original "sil" only worked through
# match.arg() partial matching, which is fragile.
ggplotly(fviz_nbclust(BoT, kmeans, method = "silhouette"))
fviz_nbclust(BoT, kmeans, method = "gap_stat")
We can see from the various figures that the best choice for the total number of clusters is either 2 (WSS and silhouette) or 3 (gap statistic, and the dendrogram from HC). For this paper, we will choose 3 clusters for consistency with HC.
# Fix the RNG seed so the 25 random k-means initialisations are reproducible;
# without it, re-knitting can yield different cluster labels and tables.
set.seed(123)
k3 <- kmeans(BoT, centers = 3, nstart = 25)
fviz_cluster(k3, data = BoT, geom = "point")
# Cross-tabulate the k-means clusters against the original cohort id.
table(Data$cohort_id, k3$cluster)
##
## 1 2 3
## 1 23 16 17
## 6 13 23 22
## 10 16 30 40
## 11 4 8 2
## 12 11 6 15
## 15 18 16 21
## 16 7 10 14
## 17 10 16 20
## 22 39 69 49
## 28 9 29 12
## 30 15 15 16
## 35 18 19 18
## 39 19 22 19
## 40 11 21 14
## 41 20 10 13
We will save the results so that statistical tests can be performed on them.
# Attach the k-means cluster assignment to the feature table.
Data_Kmeans_Result <- BoT %>% mutate(Cluster = k3$cluster)

# Per-cluster mean of every feature. `across(everything(), mean)` replaces the
# superseded `summarise_all("mean")` (dplyr >= 1.0), and we reuse
# Data_Kmeans_Result instead of rebuilding the mutate() from BoT a second time.
Data_Kmeans_Result_Summarized <- Data_Kmeans_Result %>%
  group_by(Cluster) %>%
  summarise(across(everything(), mean)) %>%
  ungroup()

paged_table(Data_Kmeans_Result_Summarized)

# Persist both tables for the downstream tests.
write.csv(Data_Kmeans_Result, "Final/Data_Kmeans_Result.csv", row.names = FALSE)
write.csv(Data_Kmeans_Result_Summarized, "Final/Data_Kmeans_Result_Summarized.csv", row.names = FALSE)
Next, look at the histogram of each feature, broken down by cluster.
# One interactive dodged bar chart of counts per feature column, colored by
# k-means cluster. The last column of Data_Kmeans_Result is `Cluster`, so
# iterate over the remaining (feature) columns rather than a hard-coded 1:8.
plt2 <- htmltools::tagList()
n_features <- ncol(Data_Kmeans_Result) - 1  # every column except `Cluster`
for (i in seq_len(n_features)) {
  feature <- colnames(Data_Kmeans_Result)[i]
  p <- ggplot(Data_Kmeans_Result,
              aes(x = as.factor(.data[[feature]]),
                  fill = as.factor(Cluster),
                  color = as.factor(Cluster))) +
    # geom_bar() is the supported form of geom_histogram(stat = "count")
    # and avoids the ggplot2 deprecation warning.
    geom_bar(position = "dodge")
  # ggplotly() already returns an htmlwidget; plotly's as.widget() wrapper
  # is deprecated and unnecessary.
  plt2[[i]] <- ggplotly(p)
}
Using Kruskal-Wallis test as the non-parametric alternative to one-way ANOVA test, since I believe my data is not normally distributed. Also, the original data is of ordinal type.
# BUG FIX: this loop previously tested Data_HC_Result — the HIERARCHICAL
# clustering assignments — so the printed results in this k-means section were
# an exact duplicate of the HC section's output. Test the k-means clusters
# (Data_Kmeans_Result) instead. The redundant `data =` argument is also
# dropped, since the formula terms are fully `$`-qualified.
for (feature in colnames(Data_Kmeans_Result)[1:8]) {
  print(kruskal.test(Data_Kmeans_Result[[feature]] ~ Data_Kmeans_Result$Cluster))
}
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 77.025, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 286.55, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 170.19, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 4.612, df = 2, p-value = 0.09966
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 25.141, df = 2, p-value = 3.473e-06
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 3.3634, df = 2, p-value = 0.1861
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 115.62, df = 2, p-value < 2.2e-16
##
##
## Kruskal-Wallis rank sum test
##
## data: Data_HC_Result[[colnames(Data_HC_Result)[i]]] by Data_HC_Result$Cluster
## Kruskal-Wallis chi-squared = 69.588, df = 2, p-value = 7.747e-16
Post-hoc test chosen: Pairwise Wilcoxon test
# Post-hoc pairwise Wilcoxon rank-sum tests with Benjamini-Hochberg adjustment,
# one run per k-means feature. Single-feature example (the original comment
# referenced the HC result table; corrected to the k-means table used here):
# pairwise.wilcox.test(Data_Kmeans_Result[["Control"]], Data_Kmeans_Result$Cluster, p.adjust.method = "BH")
for(i in 1:8){
print(pairwise.wilcox.test(Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]], Data_Kmeans_Result$Cluster, p.adjust.method = "BH"))
}
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 <2e-16 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.43 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.0057 <2e-16
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 3.3e-09 -
## 3 0.55 3.7e-12
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 0.12 -
## 3 9.2e-07 9.8e-12
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 0.46 -
## 3 6.1e-13 2.1e-15
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 0.00073 -
## 3 1.4e-05 0.13691
##
## P value adjustment method: BH
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: Data_Kmeans_Result[[colnames(Data_Kmeans_Result)[i]]] and Data_Kmeans_Result$Cluster
##
## 1 2
## 2 <2e-16 -
## 3 0.23 <2e-16
##
## P value adjustment method: BH
The “BH” (aka “fdr”) method of Benjamini and Hochberg and the “BY” method of Benjamini and Yekutieli control the false discovery rate — the expected proportion of false discoveries amongst the rejected hypotheses. The false discovery rate is a less stringent condition than the family-wise error rate, so these methods are more powerful than the others.